# --- Initial exploration of the data set ----
# Preview the first six rows (row names indicate the mtcars data).
head(df)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Dimensions: 32 rows, 11 columns.
dim(df)
## [1] 32 11
# Structure: every column is numeric.
str(df)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Five-number summary plus mean for each column.
summary(df)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# Flag rows whose mpg lies outside the central 95% of the distribution
# (below the 2.5th or above the 97.5th percentile).
lower_bound <- quantile(df$mpg, 0.025)
upper_bound <- quantile(df$mpg, 0.975)
# NOTE: despite the name, outlier_df holds row indices, not a data frame.
outlier_df <- which(df$mpg < lower_bound | df$mpg > upper_bound)
remove(lower_bound)
remove(upper_bound)
# Inspect the flagged rows (only the Toyota Corolla at mpg = 33.9).
df[outlier_df,]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
remove(outlier_df)
# Keep only columns with fewer than 90% missing values (none are dropped
# here, as the unchanged dimensions below confirm).
df <- df[,colMeans(is.na(df)) < .9]
dim(df)
## [1] 32 11
The identified near zero variance predictors are the following:
# create a zero variance variable for demonstration purposes
df$one <- 1
# nearZeroVar() (caret) returns the indices of near-zero-variance columns;
# it should flag exactly the constant column just added.
near_zero_vars <- nearZeroVar(df)
df[, near_zero_vars]
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
After the exclusion of near-zero-variance predictors the data set looks as follows:
# Drop the near-zero-variance columns by index, restoring the original
# 11 columns.
df <- df[, -c(near_zero_vars)]
remove(near_zero_vars)
head(df)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Collinearity is the situation where a pair of predictor variables have a substantial correlation with each other. In general, there are good reasons to avoid data with highly correlated predictors as it can result in highly unstable models and degraded predictive performance.
The darker areas in the correlation plot show variables which are correlated with each other.
# Separate predictors from the outcome: 'mpg' (column 1) is the response,
# so it is excluded from the predictor set.
predictors <- df[, -1]
# Classify columns by type. vapply() is preferred over sapply() because it
# guarantees a logical vector even for degenerate (zero-column) input.
is_num <- vapply(predictors, is.numeric, logical(1))
# Non-numeric predictors are set aside, to be added back later.
predictors_non_numeric <- predictors[, !is_num]
predictors_numeric <- predictors[, is_num]
# Pairwise correlations among numeric predictors; darker cells in the plot
# indicate stronger correlation.
correlations <- cor(predictors_numeric)
corrplot::corrplot(correlations, order = "hclust", tl.cex = 0.5)
Removing the following highly correlated predictors:
# findCorrelation() (caret) flags predictor columns with pairwise
# correlation above the cutoff, for removal.
highCorr <- findCorrelation(correlations, cutoff = 0.75)
remove(correlations)
# Preview the predictors flagged as highly correlated.
head(predictors_numeric[highCorr])
## cyl disp gear
## Mazda RX4 6 160 4
## Mazda RX4 Wag 6 160 4
## Datsun 710 4 108 4
## Hornet 4 Drive 6 258 3
## Hornet Sportabout 8 360 3
## Valiant 6 225 3
# Drop the flagged columns.
predictors_numeric <- predictors_numeric[, -highCorr]
remove(highCorr)
names(predictors_numeric)
## [1] "hp" "drat" "wt" "qsec" "vs" "am" "carb"
# Rebuild df from the surviving predictors plus the outcome; this also
# moves 'mpg' to the last column.
df <- cbind(df[, names(df) %in% names(predictors_numeric)],
subset(df, select = "mpg", drop = FALSE))
remove(predictors_numeric)
remove(predictors_non_numeric)
head(df)
## hp drat wt qsec vs am carb mpg
## Mazda RX4 110 3.90 2.620 16.46 0 1 4 21.0
## Mazda RX4 Wag 110 3.90 2.875 17.02 0 1 4 21.0
## Datsun 710 93 3.85 2.320 18.61 1 1 1 22.8
## Hornet 4 Drive 110 3.08 3.215 19.44 1 0 1 21.4
## Hornet Sportabout 175 3.15 3.440 17.02 0 0 2 18.7
## Valiant 105 2.76 3.460 20.22 1 0 1 18.1
head(df)
## hp drat wt qsec vs am carb mpg
## Mazda RX4 110 3.90 2.620 16.46 0 1 4 21.0
## Mazda RX4 Wag 110 3.90 2.875 17.02 0 1 4 21.0
## Datsun 710 93 3.85 2.320 18.61 1 1 1 22.8
## Hornet 4 Drive 110 3.08 3.215 19.44 1 0 1 21.4
## Hornet Sportabout 175 3.15 3.440 17.02 0 0 2 18.7
## Valiant 105 2.76 3.460 20.22 1 0 1 18.1
Dimension of dataset after removal of highly correlated predictors:
dim(df)
## [1] 32 8
Review correlation plot again after removal of correlated predictors (reduced collinearity):
# Re-check the correlation structure among the remaining predictors
# (column 8 is the outcome 'mpg' and is excluded).
correlations <- cor(df[, -c(8)])
corrplot::corrplot(correlations, order = "hclust",tl.cex = 0.5)
remove(correlations)
remove(predictors)
The darker areas should be reduced as a result of having removed correlated predictors.
# Base-graphics histogram of the outcome; the bin count follows the
# square-root rule.
hist(
  df$mpg,
  breaks = sqrt(nrow(df)),
  main = "Histogram of mpg",
  xlab = "mpg"
)
# Equivalent ggplot2 histogram with a fixed number of bins.
ggplot(df, aes(x = mpg)) +
  geom_histogram(bins = 5L, fill = "#0c4c8a") +
  theme_minimal()
# Copy of df for plotting, with vs/am recoded as labelled factors.
df_lattice <- df
# convert to factor
df_lattice$vs <- as.factor(ifelse(test = df_lattice$vs == 0, yes = "v_shaped", no = "straight"))
df_lattice$am <- as.factor(ifelse(test = df_lattice$am == 0, yes = "automatic", no = "manual"))
# plot
# Histogram of mpg panelled by transmission type.
lattice::histogram(~mpg | am, data = df_lattice)
# Density of mpg panelled by transmission, grouped by engine shape.
lattice::densityplot(~mpg | am,
data = df_lattice,
groups = vs,
plot.points = FALSE,
auto.key = TRUE)
# mpg vs horsepower with a straight-line fit overlaid.
plot(x = df$hp, y = df$mpg)
lm_mpg_hp <- lm(mpg ~ hp, data = df)
abline(lm_mpg_hp)
# create quadratic linear model
lm_mpg_hp <- lm(mpg ~ hp + I(hp^2), data = df)
# sequence of x variable
newdat = data.frame(hp = seq(min(df$hp), max(df$hp), length.out = 100))
# predict
newdat$pred = predict(lm_mpg_hp, newdata = newdat)
remove(lm_mpg_hp)
plot(mpg ~ hp, data = df)
# plot prediction
# Overlay the fitted quadratic curve on the scatter plot.
lines(x = newdat$hp, y = newdat$pred)
remove(newdat)
# Scatter plot encoding four variables: position (hp, mpg), shape (vs),
# colour (am) and point size (wt).
ggplot(data = df_lattice, aes(x = hp, y = mpg, shape = vs, colour = am, size = wt)) +
geom_point()
# Same relationship with a linear smoother and confidence band.
ggplot(data = df_lattice, aes(x = hp, y = mpg)) +
geom_point() +
stat_smooth(method = lm, se = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
# All pairwise scatter plots of the reduced data set.
pairs(df)
# Pairs feature plot of every column against 'vs' (column 5, excluded from x).
caret::featurePlot(x = df[, -5],
y = df$vs,
plot = "pairs")
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix (scatter plots, densities and correlations).
ggpairs(data = df,progress = FALSE)
# Lattice scatter plot of mpg vs hp, panelled by transmission, with grid
# lines and a loess smoother.
xyplot(mpg ~ hp | am,
data = df_lattice,
main = "Lattice Scatter Plot in R",  # fixed typo: "Lattie" -> "Lattice"
type = c("p", "g", "smooth"))
# Scatter plots of each predictor against the outcome. Columns 5, 6, 8
# (the two factor columns vs/am and the outcome mpg) are excluded from x.
# Warnings are suppressed: loess emits pseudoinverse warnings on the
# low-cardinality predictors, as recorded below.
suppressWarnings(
featurePlot(x = df_lattice[, -c(5,6,8)],
y = df_lattice$mpg,
plot = "scatter",
type = c("p", "g", "smooth"),
layout = c(3, 2))
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number -0
# Lattice scatter-plot matrix.
splom(df)
# Order observations by mpg so the line plot shows the sorted profile.
df_sorted_mpg <- df_lattice[order(df_lattice$mpg),]
# seq_along() is safer than the original 1:length() pattern (which yields
# c(1, 0) for empty input).
idx <- seq_along(df_sorted_mpg$mpg)
plot(idx, y = df_sorted_mpg$mpg)
lines(x = idx,
y = df_sorted_mpg$mpg,
pch = 18,
col = "blue",
type = "b",
lty = 2)
# Add a legend to the plot. lty = 2 matches the dashed line drawn above;
# the original lty = 1:2 rendered a solid sample line for the single entry.
legend("topleft",
legend=c("Line 1"),
col=c("blue"),
lty = 2,
cex=0.8)
# ggplot version: sorted index on x, coloured by engine shape, line type
# by transmission.
ggplot(data = df_sorted_mpg, aes(
x = seq_along(mpg),
y = mpg,
colour = vs,
# group = supp,
# fill = xxx,
linetype = am
)) +
geom_line() +
ylim(0, max(df_sorted_mpg$mpg) * 1.1) +
expand_limits(y = 0) +
geom_point()
remove(df_sorted_mpg)
# Base-graphics box plot of the outcome.
boxplot(df$mpg,
ylab = "mpg"
)
# Equivalent ggplot2 box plot.
ggplot(df) +
aes(x = "", y = mpg) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
Box Plot Outliers:
# Values flagged as outliers by the boxplot rule; boxplot.stats() returns
# them in its 'out' component.
out <- boxplot.stats(df$mpg)$out
# %in% handles the empty-outlier case cleanly (all FALSE); the c() wrapper
# in the original was redundant.
out_ind <- which(df$mpg %in% out)
remove(out)
# No boxplot outliers in mpg, as the empty result shows.
df[out_ind, ]
## [1] hp drat wt qsec vs am carb mpg
## <0 rows> (or 0-length row.names)
remove(out_ind)
# Lattice box-and-whisker plot of mpg by transmission, panelled by engine
# shape.
bwplot(mpg ~ am | vs, df_lattice)
# remove(df_lattice)
Start with all variables and exclude irrelevant ones.
# Linear regression after excluding columns 5, 1, 2, 7, 6 (vs, hp, drat,
# carb, am), leaving mpg ~ wt + qsec.
lm_df <- lm(mpg ~ ., data = df[, -c(5,1,2,7,6)])
summary(lm_df)
##
## Call:
## lm(formula = mpg ~ ., data = df[, -c(5, 1, 2, 7, 6)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3962 -2.1431 -0.2129 1.4915 5.7486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.7462 5.2521 3.760 0.000765 ***
## wt -5.0480 0.4840 -10.430 2.52e-11 ***
## qsec 0.9292 0.2650 3.506 0.001500 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.596 on 29 degrees of freedom
## Multiple R-squared: 0.8264, Adjusted R-squared: 0.8144
## F-statistic: 69.03 on 2 and 29 DF, p-value: 9.395e-12
# Standard lm diagnostic panel: Residuals vs Fitted, Normal Q-Q,
# Scale-Location, Residuals vs Leverage.
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(lm_df)
par(mfrow=c(1,1)) # Change back to 1 x 1
This plot shows if residuals have non-linear patterns. There could be a non-linear relationship between predictor variables and an outcome variable and the pattern could show up in this plot if the model doesn’t capture the non-linear relationship. If you find equally spread residuals around a horizontal line without distinct patterns, that is a good indication you don’t have non-linear relationships.
This plot shows if residuals are normally distributed. Do residuals follow a straight line well or do they deviate severely? It’s good if residuals are lined well on the straight dashed line.
This is the Scale-Location (also called Spread-Location) plot. It shows whether residuals are spread equally along the ranges of predictors. This is how you can check the assumption of equal variance (homoscedasticity). It’s good if you see a horizontal line with equally (randomly) spread points.
This plot helps us to find influential cases if any. Not all outliers are influential in linear regression analysis (whatever outliers mean). Even though data have extreme values, they might not be influential to determine a regression line. That means, the results wouldn’t be much different if we either include or exclude them from analysis. They follow the trend in the majority of cases and they don’t really matter; they are not influential. On the other hand, some cases could be very influential even if they look to be within a reasonable range of the values. They could be extreme cases against a regression line and can alter the results if we exclude them from analysis. Another way to put it is that they don’t get along with the trend in the majority of the cases.
Unlike the other plots, this time patterns are not relevant. We watch out for outlying values at the upper right corner or at the lower right corner. Those spots are the places where cases can be influential against a regression line. Look for cases outside of a dashed line, Cook’s distance. When cases are outside of the Cook’s distance (meaning they have high Cook’s distance scores), the cases are influential to the regression results. The regression results will be altered if we exclude those cases.
# Add a quadratic wt term to capture curvature in the mpg ~ wt relationship.
lm_df <- lm(mpg ~ wt + I(wt^2) + qsec, data = df)
summary(lm_df)
##
## Call:
## lm(formula = mpg ~ wt + I(wt^2) + qsec, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2200 -1.2521 -0.6288 0.9357 5.1761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.6418 5.6768 5.750 3.59e-06 ***
## wt -12.4331 2.0842 -5.965 2.01e-06 ***
## I(wt^2) 1.0730 0.2970 3.613 0.001174 **
## qsec 0.8599 0.2236 3.846 0.000634 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.182 on 28 degrees of freedom
## Multiple R-squared: 0.8816, Adjusted R-squared: 0.8689
## F-statistic: 69.5 on 3 and 28 DF, p-value: 4.345e-13
# create dataframe with predictions
# create x input values
# NOTE(review): wt and qsec are varied jointly along their ranges, so the
# curve below is a slice through predictor space, not wt's marginal effect.
pred_df <- data.frame(wt = seq(min(df$wt), max(df$wt), length.out = 100),
qsec = seq(min(df$qsec), max(df$qsec), length.out = 100))
# predict
pred_df$pred <- predict(lm_df, newdata = pred_df)
# plot actual data
plot(mpg ~ wt, data = df)
# plot prediction
# pred_df <- pred_df[order(pred_df$wt),]
lines(x = pred_df$wt, y = pred_df$pred, lwd = 1, lty = 2, col = c("blue"))
# Add a legend to the plot
legend("topleft",
legend=c("LM Prediction"),
col=c("blue"),
lty = 2,
cex=0.8)
remove(pred_df)
# Re-run the diagnostic panel on the quadratic model.
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(lm_df)
par(mfrow=c(1,1)) # Change back to 1 x 1
remove(lm_df)
remove(df_lattice)
Residuals vs Fitted plot is cleaner now after having considered the quadratic relationship.
# 80/20 train/test split, stratified on the outcome by caret.
index_train <- caret::createDataPartition(df$mpg,
p = 0.8,
list = FALSE)
df_train <- df[index_train,]
df_test <- df[-index_train,]
remove(index_train)
# Preprocessing pipeline: Box-Cox transform, centre, scale, then PCA
# keeping enough components to explain 95% of the variance.
# NOTE(review): the outcome 'mpg' is included here (see its row in the
# rotation below) — confirm that preprocessing the outcome is intended.
preproc_alg <- caret::preProcess(df_train,
thresh = 0.95,
method = c("BoxCox",
"center",
"scale",
"pca"))
# Apply the fitted transformations to the training data (PCA scores).
df_train_prpr <- stats::predict(preproc_alg, df_train)
head(df_train_prpr)
## PC1 PC2 PC3 PC4 PC5
## Mazda RX4 Wag -0.1669268 1.7180059 -0.07991657 -0.4140913 -0.87960495
## Datsun 710 -2.4244826 0.2022331 0.47306897 0.9208873 0.05303057
## Hornet 4 Drive -1.0122492 -1.9941515 0.51515573 0.3753043 0.36782282
## Hornet Sportabout 1.1130121 -0.4317811 0.83220698 -0.4993654 0.27773113
## Valiant -0.5935989 -2.6417352 0.63921368 0.7373085 -0.11366940
## Duster 360 2.3883275 0.3207399 0.17311649 -0.1641141 0.60605433
# PCA loadings (one row per original variable).
preproc_alg$rotation
## PC1 PC2 PC3 PC4 PC5
## hp 0.4207703 0.13617289 0.01625092 0.31325119 0.35841135
## drat -0.3029680 0.43234519 -0.52876009 -0.16433618 0.30553159
## wt 0.3976895 -0.25688437 -0.26896143 0.15082129 -0.26674989
## qsec -0.2853227 -0.49852303 -0.34027042 -0.01154455 -0.45592925
## vs -0.3795434 -0.23342279 -0.37562357 0.55737082 0.36320363
## am -0.2292635 0.55509952 0.14188844 0.55764412 -0.54899896
## carb 0.3423247 0.32900243 -0.58867527 -0.23756272 -0.25358917
## mpg -0.4216583 0.09568444 0.15575784 -0.41703926 -0.03924999
# Per-variable standard deviations used for scaling.
preproc_alg$std
## hp drat wt qsec vs am carb
## 1.34306317 0.09317022 0.43097395 0.10436273 0.50395263 0.49734746 0.61279798
## mpg
## 0.30237150
The default scree plot (the "barplot" type) produces a bar chart of the variances of the principal components:
# PCA on the training data (centred and scaled); plot() on a prcomp object
# draws a bar chart of the component variances.
df_prcomp <- prcomp(df_train, center = TRUE, scale. = TRUE)
plot(df_prcomp)
# Proportion of total variance explained by each principal component.
var_explained <- df_prcomp$sdev^2 / sum(df_prcomp$sdev^2)
# Scree plot. qplot() was deprecated in ggplot2 3.4.0, so the plot is built
# explicitly with ggplot(); seq_along() replaces the 1:length() pattern.
library(ggplot2)
ggplot(
data.frame(component = seq_along(var_explained),
variance = var_explained),
aes(x = component, y = variance)
) +
geom_point() +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot") +
ylim(0, 1)
remove(var_explained)
# Share of each PCA score column in the total, from the caret-preprocessed
# data. NOTE(review): this divides standard deviations rather than
# variances (sd^2), so it is not the usual "proportion of variance
# explained" — confirm the intent.
var_explained_caret <-
sapply(df_train_prpr, sd) / sum(sapply(df_train_prpr, sd))
var_explained_caret
## PC1 PC2 PC3 PC4 PC5
## 0.42991713 0.27020519 0.12676675 0.09245660 0.08065433
# Shares sum to one by construction.
sum(var_explained_caret)
## [1] 1
remove(var_explained_caret)
# Draw a 2x2 diagnostic panel for a prcomp object: proportion of variance,
# cumulative proportion, and two scree plots (bar and line), printing the
# variance proportions as a side effect.
pcaCharts <- function(x) {
  prop_var <- x$sdev^2 / sum(x$sdev^2)
  print("proportions of variance:")
  print(prop_var)
  par(mfrow = c(2, 2))
  plot(prop_var,
       xlab = "Principal component",
       ylab = "Proportion of variance explained",
       ylim = c(0, 1),
       type = "b")
  plot(cumsum(prop_var),
       xlab = "Principal component",
       ylab = "Cumulative Proportion of variance explained",
       ylim = c(0, 1),
       type = "b")
  screeplot(x)
  screeplot(x, type = "l")
  par(mfrow = c(1, 1))
}
pcaCharts(df_prcomp)
## [1] "proportions of variance:"
## [1] 0.59745505 0.25590584 0.05587410 0.02655431 0.02405089 0.01947729 0.01223229
## [8] 0.00845021
remove(df_prcomp)
remove(pcaCharts)
# Temporary labelled factor version of 'vs' for grouping in the splom below.
df_train$vs_factor <- as.factor(ifelse(df_train$vs == 0, "v_shaped", "straight"))
# Common axis range across the first three principal components.
panelRange <- extendrange(df_train_prpr[, 1:3])
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
# Upper-panel function for splom(): draws reference circles at t = 1, 2, 3
# before plotting the grouped points. The original repeated the identical
# ellipse()/panel.xyplot() pair three times; a loop removes the duplication
# while drawing the circles in the same order with the same settings.
upperp <- function(...)
{
args <- list(...)
ref <- trellis.par.get("reference.line")
for (t_val in 1:3) {
circ <- ellipse(diag(rep(1, 2)), t = t_val)
panel.xyplot(circ[,1], circ[,2],
type = "l",
lty = ref$lty,
col = ref$col,
lwd = ref$lwd)
}
# Points are drawn last so they sit on top of the reference circles.
panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts)
}
# Scatter-plot matrix of the first three principal components, grouped by
# engine shape; upper panels get the reference circles from upperp() and
# lower panels are left blank.
splom(as.data.frame(df_train_prpr[, 1:3]),
groups = df_train$vs_factor,
type = c("p", "g"),
as.table = TRUE,
lower.panel = function(...){},
upper.panel = upperp,
auto.key = list(columns = 2),
prepanel.limits = function(x) panelRange)
remove(panelRange)
# Drop the temporary helper factor again.
df_train$vs_factor <- NULL
# Inspect caret's resampling defaults: bootstrap with 25 repetitions and a
# 75% resample proportion.
trainControl()$method
## [1] "boot"
trainControl()$number
## [1] 25
trainControl()$repeats
## [1] NA
trainControl()$p
## [1] 0.75
# Copy the training set and add the quadratic weight term.
df_train_sq <- df_train
df_train_sq$wt2 <- df_train_sq$wt^2
# check CPU in terminal: 'lscpu'
# Spin up a 5-worker PSOCK cluster and register it as the foreach backend
# so caret::train() can resample in parallel.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)
# Linear model with caret's default bootstrap resampling; each resample is
# preprocessed with Box-Cox, centering, scaling and PCA.
lm_model_caret <- caret::train(
mpg ~ .,
data = df_train_sq,
preProc = c("BoxCox", "center", "scale", "pca"),
method = "lm"
)
# NOTE(review): consider on.exit()/registerDoSEQ() so the backend is reset
# even if train() errors — confirm desired behaviour.
stopCluster(cl)
remove(cl)
lm_model_caret
## Linear Regression
##
## 28 samples
## 8 predictor
##
## Pre-processing: Box-Cox transformation (6), centered (8), scaled (8),
## principal component signal extraction (8)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 28, 28, 28, 28, 28, 28, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2.650173 0.8446887 2.168273
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# insert quadratic element in df_test (in line with df_train)
df_test$wt2 <- df_test$wt^2
# Apply the transformations:
predicted <- predict(lm_model_caret, df_test)
# Side-by-side comparison columns on the hold-out set.
df_test$observed <- df_test$mpg
df_test$predicted <- round(predicted, 1)
# NOTE(review): residual is computed as predicted - observed here; lm()
# residuals are observed - fitted, so the sign convention is flipped —
# confirm this is intended.
df_test$residual <- round(df_test$predicted - df_test$observed, 1)
df_test[, c("predicted", "observed", "residual")]
## predicted observed residual
## Mazda RX4 21.7 21.0 0.7
## Merc 280C 20.0 17.8 2.2
## AMC Javelin 17.9 15.2 2.7
## Lotus Europa 27.8 30.4 -2.6
# In-sample fit on the training data.
caret::R2(pred = predict(lm_model_caret, df_train_sq),
obs = df_train_sq$mpg)
## [1] 0.8804422
caret::RMSE(pred = predict(lm_model_caret, df_train_sq),
obs = df_train_sq$mpg)
## [1] 2.05518
# Out-of-sample performance on the hold-out set (rounded predictions).
caret::R2(pred = df_test$predicted,
obs = df_test$observed)
## [1] 0.9980506
caret::RMSE(pred = df_test$predicted,
obs = df_test$observed)
## [1] 2.201136
# Predicted vs observed on the hold-out set; points on the 45-degree line
# are perfect predictions.
plot(x = df_test$predicted,
y = df_test$observed)
abline(a=0, b=1)
# Same comparison with unrounded predictions.
plot(x = predict(lm_model_caret, df_test),
y = df_test$observed)
abline(a=0, b=1)
# ggplot version. Referencing the bare column name inside aes() (rather
# than df_test$observed) keeps the mapping tied to the data argument.
ggplot(df_test, aes(x = predict(lm_model_caret, df_test), y = observed)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
labs(x = 'Predicted Values', y = 'Actual Values', title = 'Predicted vs. Actual Values')